Business Problem
The situation of Airbnb houses demand in Boston and Beijing. Compared it with the demand of hotels on Booking.com to futher discover the influence of competitor to Airbnb.
Data Sources
Boston and Beijing Airbnb data: http://insideairbnb.com/get-the-data.html Boston and Beijing Booking data: web crawler
Inside Airbnb is an non-commercial and independent dataset of Airbnb global demand, which includes the first-hand data of Airbnb booking history and reviews of users.
Preprocessing Steps
Set up environment
library(tidyverse)
library(magrittr)
library(readr)
library(lubridate)
library(ggthemes)
library(leaflet)
bj_list <- read_csv('BJlistings.csv')
bj_reviews <- read_csv('BJreviews.csv')
bj_calendar <- read_csv('BJcalendar.csv')
bos_list <- read_csv('Boslistings.csv')
bos_reviews <- read_csv('Bosreviews.csv')
bos_calendar <- read_csv('Boscalendar.csv')
Primary Variables Removal
1. remove non neccessary and repeated columns
# step 1
# remove all url, which are NAs
bos_list_NAomit <- bos_list %>%
select(-ends_with("_url"), -experiences_offered,
-host_total_listings_count,-calendar_last_scraped, -country)
bj_list_NAomit <- bj_list %>%
select(-ends_with("_url"), -experiences_offered,
-host_total_listings_count,-calendar_last_scraped, -country)
# step 2
# remove needless columns
bj_list_rmneedlss <- bj_list_NAomit[ ,-c(2:3)]
bos_list_rmneedless <- bos_list_NAomit[,-c(2:3)]
# step 3
# remove columns with over 60% NA observations
bj_list_cleaned <- bj_list_rmneedlss[, -which(colMeans(is.na(bj_list_rmneedlss)) > 0.8)]
bos_list_cleaned <- bos_list_rmneedless[, -which(colMeans(is.na(bos_list_rmneedless)) > 0.6)]
# View the columns we removed for step3
bos_removed_col <- apply(bos_list_rmneedless, 2, function(col)sum(is.na(col))/length(col))
bos_removed_col[bos_removed_col > 0.6]
## neighbourhood_group_cleansed square_feet
## 1.0000000 0.9646421
## weekly_price monthly_price
## 0.9167380 0.9155974
bj_removed_col <- apply(bj_list_rmneedlss, 2, function(col)sum(is.na(col))/length(col))
bj_removed_col[bj_removed_col>0.8]
## neighbourhood_group_cleansed square_feet
## 1.0000000 0.9995878
## weekly_price monthly_price
## 0.9915752 0.9914206
## license jurisdiction_names
## 0.8973566 0.9999742
# step 3 removed for beijing & boston listing: neighbourhood_group_cleansed, square_feet, weekly_price, monthly_price
# special removal only for beijingllisting: license, jurisdiction_names
# though high NA rate but will not be directly removed for beijing listing:
# cleaning_fee,security_deposit (over 60% NA rate)
2. remove $ sign for security_deposit, price, and cleaning_fee
# eliminate $ sign
bj_list_cleaned$security_deposit = gsub("\\$", "", bj_list_cleaned$security_deposit) %>% as.numeric()
bj_list_cleaned$price <- gsub("\\$", "", bj_list_cleaned$price) %>% as.numeric()
bj_list_cleaned$cleaning_fee <- gsub("\\$", "", bj_list_cleaned$cleaning_fee) %>% as.numeric()
bos_list_cleaned$security_deposit <- gsub("\\$", "", bos_list_cleaned$security_deposit) %>% as.numeric()
bos_list_cleaned$price <- gsub("\\$", "", bos_list_cleaned$price) %>% as.numeric()
bos_list_cleaned$cleaning_fee <- gsub("\\$", "", bos_list_cleaned$cleaning_fee) %>% as.numeric()
# $ sign in calendar data
bos_calendar$price <- gsub("\\$", "", bos_calendar$price) %>% as.numeric()
bj_calendar$price <- gsub("\\$", "", bj_calendar$price) %>% as.numeric()
summary(bos_calendar$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 10.0 80.0 144.0 192.6 232.0 999.0 24144
summary(bj_calendar$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 246.0 371.0 406.1 528.0 999.0 1756475
3. remove % sign for host_response_rate
bos_list_cleaned$host_response_rate <- sapply(bos_list_cleaned$host_response_rate,
function(x) gsub('%','',x))
bj_list_cleaned$host_response_rate <- sapply(bj_list_cleaned$host_response_rate,
function(x) gsub('%','',x))
EDA Part 1
Graph 1
bj_reviews_yr <- bj_reviews %>%
group_by(year) %>%
count(listing_id) %>%
arrange(desc(n))
bos_reviews_yr <- bos_reviews %>%
group_by(year) %>%
count(listing_id) %>%
arrange(desc(n))
# bjl %>% select(id, name, number_of_reviews) %>% arrange(desc(number_of_reviews))
# bjl %>% select(id, name, number_of_reviews) %>% arrange(number_of_reviews)
# 23437 listings have reviews for Beijing
# 23437/38814
#bosl %>% select(id, name, number_of_reviews) %>% #arrange(desc(number_of_reviews))
# 3507 listings have reviews for Boston
# 3507/3585
# the rate of review in boston is higher than beijing
ggplot(bj_reviews_yr,aes(x=as.factor(year),y=n))+
geom_jitter(alpha=0.3,aes(color=n)) +
geom_smooth() +
theme_economist()+
scale_fill_brewer() +ylim(0,250) +labs(title = 'How popular is Airbnb in Beijing', subtitle = 'Number of reviews received for a single listing over years')
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

ggplot(bos_reviews_yr,aes(x=as.factor(year),y=n))+
geom_jitter(alpha=0.3,aes(color=n)) +
geom_smooth() +
theme_economist()+
scale_fill_brewer() +ylim(0,250) +
labs(title = 'How popular is Airbnb in Boston', subtitle = 'Number of reviews received for a single listing over years')
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Graph 2
bos_list_cleaned_rmna <- bos_list_cleaned %>%
filter(!is.na(review_scores_rating)) %>%
filter(!is.na(host_is_superhost))
bj_list_cleaned_rmna <- bj_list_cleaned %>%
filter(!is.na(review_scores_rating)) %>%
filter(!is.na(host_is_superhost))
bos_list_cleaned_rmna$host_response_rate <- as.numeric(bos_list_cleaned_rmna$host_response_rate)
## Warning: NAs introduced by coercion
bj_list_cleaned_rmna$host_response_rate <- as.numeric(bj_list_cleaned_rmna$host_response_rate)
## Warning: NAs introduced by coercion
summary(bj_list_cleaned_rmna$host_response_rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 100.00 100.00 96.54 100.00 100.00 1894
summary(bos_list_cleaned_rmna$host_response_rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 99.00 100.00 97.09 100.00 100.00 286
ggplot(bos_list_cleaned_rmna,aes(x=host_response_rate,y=review_scores_rating)) +
geom_jitter(aes(color=as.factor(host_is_superhost)),alpha=0.3) +theme_economist() +labs(title = 'Indicators for SuperHost', subtitle = 'Avg. Rating by Response Rate in Boston')
## Warning: Removed 286 rows containing missing values (geom_point).

ggplot(bj_list_cleaned_rmna,aes(x=host_response_rate,y=review_scores_rating)) +
geom_jitter(aes(color=as.factor(host_is_superhost)),alpha=0.3) +theme_economist() +
labs(title = 'Indicators for SuperHost', subtitle = 'Avg. Rating by Response Rate in Beijing')
## Warning: Removed 1894 rows containing missing values (geom_point).

Graph 3
glimpse(bos_calendar)
## Observations: 1,280,055
## Variables: 10
## $ listing_id <dbl> 5506, 5506, 5506, 5506, 5506, 5506, 5506, 5506, 5506...
## $ date <date> 2019-12-04, 2019-12-05, 2019-12-06, 2019-12-07, 201...
## $ available <lgl> FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRU...
## $ price <dbl> 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, ...
## $ adjusted_price <chr> "$79.00", "$79.00", "$79.00", "$79.00", "$79.00", "$...
## $ minimum_nights <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3...
## $ maximum_nights <dbl> 730, 730, 730, 730, 730, 730, 730, 730, 730, 730, 73...
## $ year <dbl> 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019...
## $ month <dbl> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, ...
## $ wkd <ord> Wed, Thu, Fri, Sat, Sun, Mon, Tue, Wed, Thu, Fri, Sa...
dim(bos_calendar)
## [1] 1280055 10
summary(bos_calendar$year)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2019 2020 2020 2020 2020 2020
avg_price_bj <- bj_calendar %>%
group_by(date, wkd) %>%
summarize(avgprice= mean(price, na.rm=T))
avg_price_bos <- bos_calendar %>%
group_by(date, wkd) %>%
summarize(avgprice=mean(price,na.rm=T))
ggplot(avg_price_bos, aes(x=wkd, y=avgprice)) +
geom_boxplot(na.rm=T) + geom_jitter(alpha=0.2) +
theme_economist() +
labs(title = 'Price Trends over the weekday & weekends in Boston', subtitle = 'avgprice = Avg. price by day')

ggplot(avg_price_bj,aes(x=wkd,y=avgprice)) +
geom_boxplot(na.rm=T) + geom_jitter(alpha=0.2) +
theme_economist() +
labs(title = 'Price Trends over the weekday & weekends in Beijing', subtitle = 'avgprice = Avg. price by day')

Graph 4
avg_price_bj_2020 <- bj_calendar %>%
filter(year == 2020) %>%
group_by(listing_id, month) %>%
summarize(avgprice= mean(price,na.rm=T))
avg_price_bos_2020 <- bos_calendar %>%
filter(year == 2020) %>%
group_by(listing_id, month) %>%
summarize(avgprice=mean(price,na.rm=T))
ylim_bos<-boxplot.stats(avg_price_bos_2020$avgprice)$stats[c(1, 5)]
ggplot(avg_price_bos_2020, aes(x = factor(month), y=avgprice)) +
geom_boxplot(outlier.shape = NA) +
coord_cartesian(ylim = ylim_bos * 1.5) +
theme_economist() +
labs(title = 'Price Trends over the month in Boston', subtitle = 'avgprice = Avg. price by listing')
## Warning: Removed 681 rows containing non-finite values (stat_boxplot).

ylim_bj<-boxplot.stats(avg_price_bj_2020$avgprice)$stats[c(1, 5)]
ggplot(avg_price_bj_2020, aes(x = factor(month), y=avgprice)) +
geom_boxplot(outlier.shape = NA) +
coord_cartesian(ylim = ylim_bos * 2.4) +
theme_economist() +
labs(title = 'Price Trends over the month in Beijing', subtitle = 'avgprice = Avg. price by listing')
## Warning: Removed 51194 rows containing non-finite values (stat_boxplot).

Graph 5
bosAirbnb <- bos_list_cleaned %>%
mutate(Log1pPrice = log1p(price), transformed_review = bos_list_cleaned$review_scores_rating^5)
bosAirbnb <- bosAirbnb %>% select(-price, -review_scores_rating)
pal <- colorNumeric(palette = rainbow(6), domain = bosAirbnb$Log1pPrice)
leaflet(data = bosAirbnb[is.na(bosAirbnb$Log1pPrice)==FALSE,]) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addCircleMarkers(~longitude, ~latitude,
color = ~pal(Log1pPrice), weight = 1, radius=1.5,
fillOpacity = 1, opacity = 1,
label = paste("Neighbourhood:", bosAirbnb$neighbourhood_cleansed)) %>%
addLegend("bottomright", pal = pal, values = ~Log1pPrice,
title = "Log1pPrice",
opacity = 1)
BJAirbnb <- bj_list_cleaned %>%
mutate(Log1pPrice = log1p(price), transformed_review = bj_list_cleaned$review_scores_rating^5)
BJAirbnb <- BJAirbnb %>% select(-price, -review_scores_rating)
pal <- colorNumeric(palette = rainbow(6), domain = BJAirbnb$Log1pPrice)
leaflet(data = BJAirbnb[is.na(BJAirbnb$Log1pPrice)==FALSE,]) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addCircleMarkers(~longitude, ~latitude,
color = ~pal(Log1pPrice), weight = 1, radius=1.5,
fillOpacity = 1, opacity = 1,
label = paste("Neighbourhood:", BJAirbnb$neighbourhood_cleansed)) %>%
addLegend("bottomright", pal = pal, values = ~Log1pPrice,
title = "Log1pPrice",
opacity = 1)